Raw data
lb.filelistZE <- list.files(here("data/RNA-Seq/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
##ZE is missing 204 in L002, and need to remove P11562_112 from both
lb.filelistZE <- (str_subset(lb.filelistZE, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_112_S11_L00", negate = TRUE))
lb.filterlistZEuniq <- (str_subset(lb.filelistZE, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_204"))
lb.filterlistZE <- str_subset(lb.filelistZE,"L001_sortmerna_trimmomatic")
lb.filelistZEWorking <- (str_subset(lb.filelistZE, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_204", negate = TRUE))
lb.filterlistZE1 <- str_subset(lb.filelistZEWorking,"L001_sortmerna_trimmomatic")
names(lb.filterlistZE1) <- sapply(lapply(strsplit(lb.filterlistZE1,"_"),"[",1:2),paste,collapse="_")
lb.filterlistZE2 <- str_subset(lb.filelistZEWorking,"L002_sortmerna_trimmomatic")
names(lb.filterlistZE2) <- sapply(lapply(strsplit(lb.filterlistZE2,"_"),"[",1:2),paste,collapse="_")
names(lb.filterlistZEuniq) <- sapply(lapply(strsplit(lb.filterlistZEuniq,"_"),"[",1:2),paste,collapse="_")
names(lb.filterlistZE1) <- str_replace(names(lb.filterlistZE1),".*salmon/","")
names(lb.filterlistZE2) <- str_replace(names(lb.filterlistZE2),".*salmon/","")
names(lb.filterlistZEuniq) <- str_replace(names(lb.filterlistZEuniq),".*salmon/","")
part1 <- suppressMessages(round(tximport(files = lb.filterlistZE1,
type = "salmon",txOut=TRUE)$counts))
part2 <- suppressMessages(round(tximport(files = lb.filterlistZE2,
type = "salmon",txOut=TRUE)$counts))
part3 <- suppressMessages(round(tximport(files = lb.filterlistZEuniq,
type = "salmon",txOut=TRUE)$counts))
all(colnames(part1)==colnames(part2))
## [1] TRUE
countsZE <- part1 + part2
countsREARRANGE <- cbind(countsZE[,54:55])
countsZE <- countsZE[,-54:-55]
countsZE <- cbind(countsZE, part3, countsREARRANGE)
#################end of ZE
lb.filelistSomaticEmbGerm <- list.files("/mnt/picea/projects/spruce/uegertsdotter/SE-germinants/salmon",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
#################start of SomaticEmbGerm
##splitting data in order to sum the counts
lb.filterlistSomaticEmbGerm <- lb.filelistSomaticEmbGerm
lb.filterlistSomaticEmbGerm1 <- (str_subset(lb.filterlistSomaticEmbGerm, "AC7", negate = TRUE))
names(lb.filterlistSomaticEmbGerm1) <- sapply(lapply(strsplit(lb.filterlistSomaticEmbGerm1,"_"),"[",4:5),paste,collapse="_")
lb.filterlistSomaticEmbGerm2 <- (str_subset(lb.filterlistSomaticEmbGerm, "BC7", negate = TRUE))
names(lb.filterlistSomaticEmbGerm2) <- sapply(lapply(strsplit(lb.filterlistSomaticEmbGerm2,"_"),"[",4:5),paste,collapse="_")
part1 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmbGerm1,
type = "salmon",txOut=TRUE)$counts))
part2 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmbGerm2,
type = "salmon",txOut=TRUE)$counts))
all(colnames(part1)==colnames(part2))
## [1] TRUE
countsEmbGerm <- part1 + part2
#################end of EmbGerm
lb.filelist29Seed <- list.files("/mnt/picea/projects/spruce/uegertsdotter/29_Spruce_Seeds_Project/salmon",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
lb.filterlist29Seed <- lb.filelist29Seed
names(lb.filterlist29Seed) <- sapply(lapply(strsplit(lb.filterlist29Seed,"_"),"[",7:8),paste,collapse="_")
counts29Seed <- suppressMessages(round(tximport(files = lb.filterlist29Seed,
type = "salmon",txOut=TRUE)$counts))
###need to recorder the ones with a letter infront of them
lb.filelistSomaticEmb <- list.files("/mnt/picea/projects/spruce/uegertsdotter/22_Somatic_Embryogenesis_Project/salmon",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
lb.filterlistSomaticEmb <- str_subset(lb.filelistSomaticEmb,"L002", negate = TRUE)
#################need to merge countsZE and part3, between column 53 and 54 (to become the new column 54)
lb.filterlistSomaticEmb1 <- str_subset(lb.filelistSomaticEmb,"L002_sortmerna_trimmomatic")
names(lb.filterlistSomaticEmb1) <- sapply(lapply(strsplit(lb.filterlistSomaticEmb1,"_"),"[",4:5),paste,collapse="_")
lb.filterlistSomaticEmb2 <- str_subset(lb.filelistSomaticEmb,"L002_sortmerna_trimmomatic")
names(lb.filterlistSomaticEmb2) <- sapply(lapply(strsplit(lb.filterlistSomaticEmb2,"_"),"[",4:5),paste,collapse="_")
lb.filterlistSomaticEmb3 <- str_subset(lb.filelistSomaticEmb,"L00", negate = TRUE)
names(lb.filterlistSomaticEmb3) <- sapply(lapply(strsplit(lb.filterlistSomaticEmb3,"_"),"[",7:8),paste,collapse="_")
names(lb.filterlistSomaticEmb1) <- str_replace(names(lb.filterlistSomaticEmb1),".*salmon/","")
names(lb.filterlistSomaticEmb2) <- str_replace(names(lb.filterlistSomaticEmb2),".*salmon/","")
part1 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmb1,
type = "salmon",txOut=TRUE)$counts))
part2 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmb2,
type = "salmon",txOut=TRUE)$counts))
part3 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmb3,
type = "salmon",txOut=TRUE)$counts))
all(colnames(part1)==colnames(part2))
## [1] TRUE
countsSomaticEmb <- part1 + part2
countsSomaticEmb <- cbind(part3, countsSomaticEmb)
################# end of SomaticEmb
Filter and select only one duplicate from each sample.
#removed P11562_112 row from sample list
samples <- filter(samples, !grepl("P11562_112",NGI.ID))
lb.filterlist <- c(lb.filterlistZE,lb.filterlist29Seed,lb.filterlistSomaticEmbGerm1,lb.filterlistSomaticEmb)
#####Filelists have been combined, however not able to proceed - NGI IDs do not match.
#####All samples should have the same Tissue Type in the Somatic and 29Seed - Time is not changed either - they should all be the same.
stopifnot(all(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist)))
#print(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist))
##assign names to the filtered filelist (which removed L002)
names(lb.filterlist) <- samples$NGI.ID
#lb.filterlist <- lb.filterlist[samples$Tissue %in% c("ZE","FMG","S","Normal","Aberrant","Non-EMB","PEM","DKM","SM","LSM","SD","ED","RO","PLS","ZE-R")] ##Are we only looking at ZE?
lb.filterlist <- lb.filterlist[samples$Experiment %in% c("Zygotic","29Seed","Somatic Embryogenesis Germinants","Somatic Embryogenesis")]
Read the expression at the gene level (there is one transcript per gene)
#lb.g <- suppressMessages(tximport(files = lb.filterlist,
# type = "salmon",txOut=TRUE))
#####redo this lb.g <- cbind(countsZE, countsSEED, countsSomaticEmbGerm, countsSomaticEmb) in the correct order
#####the order is ZE, Seeds, SomaticEmbGerm, SomaticEmb
lb.g <- cbind(countsZE, counts29Seed, countsEmbGerm, countsSomaticEmb)
counts <- lb.g
Raw Data QC analysis
Check how many genes are never expressed - reasonable level of non-expressed genes indicated.
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "11.1% percent (7331) of 66069 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
aes(x,y,col=Experiment,fill=Tissue)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
aes(x,y,col=Experiment,fill=)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 7331 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Tissue=samples$Tissue[match(ind,samples$NGI.ID)]) %>%
mutate(Experiment=samples$Experiment[match(ind,samples$NGI.ID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Tissue)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 3391332 rows containing non-finite values (stat_density).

Color by Time
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 3391332 rows containing non-finite values (stat_density).

QC on the normalised data
PCA
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
3 first dimensions
Seems that different time points form small clusters and ZE and FMG tissue types appear to separate. These are Comp1 and Comp2 which explains the different between most of the sampels except for one B4 ZE sample. Appears to be an outlier. This seems to indicate that the Tissue and Time components explain the difference between samples.
#mar=c(5.1,4.1,4.1,2.1)
#scatterplot3d(pc$x[,1],
# pc$x[,2],
# pc$x[,3],
# xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
# ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
# zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
# color=pal[as.integer(samples$Tissue)],
# pch=c(17,19,15,13)[as.integer(samples$Experiment)])
#legend("topleft",
# fill=pal[1:nlevels(samples$Tissue)],
# legend=levels(samples$Tissue))
#legend("bottomright",
# pch=c(17,19,15,13),
# legend=levels(samples$Experiment))
#par(mar=mar)
samples$NGI.ID
## [1] "P11562_101" "P11562_102" "P11562_103" "P11562_104" "P11562_105"
## [6] "P11562_106" "P11562_107" "P11562_108" "P11562_110" "P11562_111"
## [11] "P11562_113" "P11562_114" "P11562_115" "P11562_116" "P11562_117"
## [16] "P11562_118" "P11562_119" "P11562_120" "P11562_121" "P11562_123"
## [21] "P11562_124" "P11562_125" "P11562_126" "P11562_127" "P11562_128"
## [26] "P11562_129" "P11562_130" "P11562_131" "P11562_133" "P11562_134"
## [31] "P11562_135" "P11562_136" "P11562_137" "P11562_138" "P11562_139"
## [36] "P11562_140" "P11562_141" "P11562_142" "P11562_143" "P11562_145"
## [41] "P11562_146" "P11562_147" "P11562_148" "P11562_149" "P11562_150"
## [46] "P11562_151" "P11562_153" "P11562_154" "P11562_155" "P11562_157"
## [51] "P11562_201" "P11562_202" "P11562_203" "P11562_204" "P11562_205"
## [56] "P11562_206" "P1790_101" "P1790_102" "P1790_103" "P1790_104"
## [61] "P1790_105" "P1790_106" "P1790_107" "P1790_108" "P1790_109"
## [66] "P1790_110" "P1790_111" "P1790_112" "P2228_101" "P2228_102"
## [71] "P2228_103" "P2228_104" "P2228_105" "P2228_106" "P2228_107"
## [76] "P2228_108" "P2228_109" "P2228_110" "P2228_111" "P2228_112"
## [81] "P2228_113" "P2228_114" "P2228_115" "P2228_117" "P2228_118"
## [86] "P2228_119" "P2228_120" "P464_202" "P464_203" "P464_204"
## [91] "P464_205" "P464_206" "P464_208" "P464_211" "P464_201B"
## [96] "P464_209B" "P464_207B" "P7614_301" "P7614_302" "P7614_303"
## [101] "P7614_304" "P7614_305" "P7614_306" "P7614_307" "P7614_308"
## [106] "P7614_309" "P7614_310" "P7614_311" "P7614_312" "P7614_313"
## [111] "P7614_314" "P7614_315" "P7614_316" "P7614_317" "P7614_318"
## [116] "P7614_319" "P7614_320" "P7614_321" "P7614_322" "P7614_323"
## [121] "P7614_324"
samples <- samples[!(samples$NGI.ID == "P11562_148"),]
2D
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples)
ggplot(pc.dat,aes(x=PC1,y=PC2,col=Tissue,shape=Experiment)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

Interactive PCA Plot
suppressPackageStartupMessages(library(plotly))
interplot <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Tissue,shape=Experiment,text=NGI.ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(interplot, tooltip = "all") %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Heatmap
Filter for noise A cutoff at a VST value of 1 leaves about 32000 genes - is this adequate for the QA?
conds <- factor(paste(samples$Tissue,samples$Experiment))
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)


vstCutoff <- 6+1
- Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples. It appears that generally there is little difference between the samples across all genes
- a small difference is noticable between ZE tissues and FMG-S tissues, however generally the expression levels are relatively balanced apart from the one sample in FMG B4. Somatic samples had a small section of more highly expressed genes compared to ZE and FMG.
hm <- heatmap.2(t(scale(t(vst[sels[[vstCutoff]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal,cexCol = .2)

plot(as.hclust(hm$colDendrogram))

Perform the VST, aware
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)